# Set the default CRAN mirror so install.packages() calls below work
# non-interactively.
options(repos = c(CRAN = "https://cran.r-project.org"))
The purpose of CV is to find a better estimate of the “Prediction error” (or Prediction accuracy) measure \(\rightarrow\) these measures include MSE, SSE(RSS), misclassification rate etc from the Validation (testing data)
Better estimates lead to:
Jackknife estimator: remember to use both the original \(\hat\theta\) and the average across the observations ( \(\hat\theta_.\) ) to calculate the jackknife estimator \(\rightarrow\) they are two different things
When you are calculating bias, you are subtracting something not biased from something biased (in this case, \(\hat\theta\) - \(\hat\theta_{JK}\) )
ridge and lasso are shrinkage methods \(\rightarrow\) shrinkage methods and variable selection are regularization methods
under the shrinkage methods, we have biased \(\hat\beta\) but smaller standard error of \(\hat\beta\) \(\rightarrow\) MSE on \(\beta\) is \(E[(\hat\beta - \beta)^2]\); MSE on Y is \(E[(\hat{Y} - Y)^2]\)
we don’t know if either of those MSEs improve with shrinkage \(\rightarrow\) whether or not we get improved prediction performance depends
These methods will be helpful when we have strong collinearity between predictors \(\rightarrow\) helps decrease MSE
We would expect a degree of collinearity unless the experiment is very well designed
With ridge regression and LASSO , we’re still trying to minimize RSS/SSE, but by using a constraint (see written notes for specifics) \(\rightarrow\) apply to scaled/standardized data so that all the \(\beta\)s are cancelled out and all the X’s are in the same units (if you use different units of X, you might end up with very different estimates for the coefficients, and you don’t want your coefficients to be influenced by the measurement units of X)
LASSO and ridge are related, the only difference is that ridge is on the squared scale (L2 norm) and the LASSO is on the absolute value scale (L1 norm) \(\rightarrow\) regardless we’re trying to minimize SSE/RSS, just with constraints
The fossil.dat data set in Canvas has data collected by
Dr. Timothy Bralower of the Penn State Department of Geosciences. It
consists of two variables measured on 106 fossil shells collected from
cores drilled in the floor of the ocean: the age of shell dated by the
surrounding material and the ratio of the appearance of isotopes of
Strontium (in strontium.ratio). It is believed that this ratio is driven
by sea level and is related to climate change. A scatter plot of the
data is below.
# Load the fossil data and order rows by age so that line plots drawn in
# later chunks trace the x-axis from left to right.
# Fix: spell out TRUE instead of the reassignable shorthand T.
fossil <- read.table("./fossil.dat", header = TRUE)
sx <- sort(fossil$age, index.return = TRUE)  # sorting here makes it easier to plot later
fossil <- fossil[sx$ix, ]
# NOTE(review): attach() is generally discouraged (masking risk), but the
# rest of the document relies on the attached names `age` and
# `strontium.ratio`, so it is kept for compatibility.
attach(fossil)
## scatter plot of the data
plot(age, strontium.ratio, pch = 20, xlab = "age (in millions of years)")
# Fit polynomials of increasing degree.  I(age^k) is needed inside a
# formula so ^ is treated as arithmetic power, not formula syntax.
# These objects (poly.deg1 .. poly.deg12) are reused by the plotting
# chunk below, so their stored coefficients must not change.
poly.deg1 <- lm(strontium.ratio~age)
poly.deg2 <- lm(strontium.ratio~age+I(age^2))
poly.deg3 <- lm(strontium.ratio~age+I(age^2)+I(age^3))
poly.deg4 <- lm(strontium.ratio~age+I(age^2)+I(age^3)+I(age^4))
poly.deg5 <- lm(strontium.ratio~age+I(age^2)+I(age^3)+I(age^4)+I(age^5))
poly.deg6 <- lm(strontium.ratio~age+I(age^2)+I(age^3)+I(age^4)+I(age^5)+
I(age^6))
# poly() builds an orthogonal polynomial basis (numerically stabler for
# high degree); fitted values match the raw-power parameterization.
poly.deg12 <- lm(strontium.ratio~poly(age, 12))
# Overlay every polynomial fit on the scatter plot; line type and color
# index k match the legend entries.
plot(age, strontium.ratio, pch = 20,
     xlab = "age (in millions of years)", main = "polynomial fits")
legtxt <- c("linear p=1", "quadratic p=2", "cubic p=3",
            "quartic p=4", "quintic p=5", "degree 6 p=6", "degree 12")
legend('bottomleft', legtxt, col = 1:7, lty = 1:7, lwd = 2)
fit.list <- list(poly.deg1, poly.deg2, poly.deg3, poly.deg4,
                 poly.deg5, poly.deg6, poly.deg12)
for (k in seq_along(fit.list)) {
  lines(age, fit.list[[k]]$fitted, lty = k, col = k, lwd = 2)
}
# The model becomes more flexible as the order increases; a more flexible
# model fits the training data better, but too much flexibility risks
# overfitting.
Recall the cv.glm() function in boot
package.
library(boot)
# Leave-one-out CV error (delta[1] = raw estimate) for polynomial
# degrees 1..15; this estimates out-of-sample prediction error.
cv.err <- numeric(15)
set.seed(2023)
for (deg in seq_len(15)) {
  fit.deg <- glm(strontium.ratio ~ poly(age, deg), family = "gaussian",
                 data = fossil)
  cv.err[deg] <- cv.glm(data = fossil, glmfit = fit.deg)$delta[1]
}
plot(cv.err, type = "b", pch = 20)
# As the polynomial order (degrees of freedom) grows, CV error drops
# sharply up to about order 6, then flattens; eventually it rises again.
# This is the expected behavior of increasing model flexibility.
# Any order in the flat region is plausible; in practice prefer the
# simplest such model -- earlier orders risk underfitting.
Let’s cut the data into 4 segments according to the quartiles of
age.
# Split ages into 4 groups at the sample quartiles.
# BUG FIX: cut() defaults to include.lowest=FALSE, so the minimum age
# fell outside the first interval and one of the 106 shells was silently
# assigned NA (the group counts summed to 105, not 106).
age.quan <- quantile(age)
age.group <- cut(age, breaks = age.quan, include.lowest = TRUE)
table(age.group)  # now 27 + 26 + 26 + 27 = 106
## age.group
## (91.8,104] (104,109] (109,115] (115,123]
## 26 26 26 27
# put the data into a few small sections based on x values
# (each point is colored by its age-quartile group)
plot(age, strontium.ratio, col=age.group, pch=20,
xlab="age (in millions of years)")
Broken lines with linear or quadratic function regression:
# Piece-wise ("broken-line") regression: an independent straight line is
# fitted within each age-quartile segment.
plot(age, strontium.ratio, col = age.group, pch = 20,
     xlab = "age (in millions of years)",
     main = "Linear fit in each segment")
abline(v = age.quan[2:4], lty = 2)
for (seg in seq_len(4)) {
  seg.fit <- lm(strontium.ratio ~ age, data = fossil,
                subset = age.group == levels(age.group)[seg])
  seg.ends <- c(age.quan[seg], age.quan[seg + 1])
  lines(seg.ends, predict(seg.fit, newdata = data.frame(age = seg.ends)),
        col = seg, lwd = 2)
}
# The fitted pattern is discontinuous at the segment boundaries.
# Piece-wise quadratic fit within each age-quartile segment.
# Fixes: "Quardratic" -> "Quadratic" in the title; the evaluation grid
# now always includes the right endpoint (seq(..., by=0.1) could stop
# just short of it, leaving a visual gap at each boundary).
plot(age, strontium.ratio, col=age.group, pch=20,
     main = "Quadratic fit in each segment")
abline(v=age.quan[2:4], lty=2)
for (i in 1:4){
  broken.lin <- lm(strontium.ratio ~ age + I(age^2), data=fossil,
                   subset = age.group == levels(age.group)[i])
  x <- seq(age.quan[i], age.quan[i+1], length.out = 60)
  lines(x, predict(broken.lin, newdata=data.frame(age=x)), col=i, lwd=2)
}
Continuous lines (splines):
# Continuous piece-wise linear fit via truncated-line basis functions:
# each basis is (age - knot)_+ , i.e. age - knot where age exceeds the
# knot and 0 otherwise, which forces the fitted line to be connected at
# the knots.  (For smoothness, first derivatives would also have to
# match -- see the spline sections below.)
n <- nrow(fossil)
basis1 <- matrix(NA, ncol = 4, nrow = n)
par(mfrow = c(1, 2))
for (i in seq_len(4)) {
  # pmax() is the idiomatic vectorized form of ifelse(age > q, age - q, 0)
  basis1[, i] <- pmax(age - age.quan[i], 0)
}
colnames(basis1) <- c("b1", "b2", "b3", "b4")
matplot(age, basis1, type = 'l', lwd = 2, xlab = "x",
        ylab = "Linear Splines", main = "Basis functions")
abline(v = age.quan[2:4], lty = 2)
pw.lin <- lm(strontium.ratio ~ basis1)
plot(age, strontium.ratio, pch = 20, col = age.group,
     xlab = "age (in millions of years)",
     main = "Piece-wise linear fit")
lines(age, pw.lin$fitted, lwd = 2)
rug(age)
abline(v = age.quan[2:4], lty = 2)
Y = \(\beta_0\) + \(\beta_1\) * \(b_1\)(x) + \(\beta_2\) * \(b_2\)(x) + … + error \(\rightarrow\) basis functions are not unique here; there are more than one ways to write a basis function and your coefficients will just change accordingly
the actual response contains the error term, but predictions don’t need to include the error term in calculations
The exact form of the B-spline (basis-spline) is derived in a recursive way and may be hard to write out analytically. It can be considered as piece-wise polynomial splines at various degrees.
Functions bs() in package splines creates
the basis function of the B-splines.
Argument “knots =” defines the knots.
Argument “degree =” defines the (highest) order of
the splines. (Cubic, by default.)
After creating the splines, use lm() or
glm() functions to fit the regression model.
# 'splines' ships with base R: do NOT call install.packages() on it
# (doing so only produced the warning "'splines' is a base package, and
# should not be updated").  Just load it.
library(splines)
# Degree-1 B-spline basis with knots at the interior quartiles.
par(mfrow = c(1, 2))
bsage1 <- bs(age, knots = age.quan[2:4], degree = 1)  # specify knot locations and degree
# Plot the basis functions themselves.
matplot(age, bsage1, type = 'l', lwd = 2,
        xlab = "x", ylab = "Basis Function", main = "Linear B-spline")
abline(v = age.quan[2:4], lty = 2)
# Ordinary least squares on the basis columns.
bspline.fit1 <- lm(strontium.ratio ~ bsage1)
bspline.fit1$coefficients  # in practice the plot matters more than the coefficients
## (Intercept) bsage11 bsage12 bsage13 bsage14
## 7.073891e-01 6.719856e-05 -4.048016e-05 -1.684489e-04 1.197341e-04
# Overlay the fitted linear B-spline on the data.
plot(age, strontium.ratio, pch = 20, col = age.group,
     xlab = "age (in millions of years)", main = "Linear B-spline fit")
lines(age, bspline.fit1$fitted, lwd = 2)
abline(v = age.quan[2:4], lty = 2)
# Continuity at the knots is guaranteed, but a degree-1 fit is not smooth.
# Degree-2 (quadratic) B-spline basis and fit.
# Fix: "Quardratic" -> "Quadratic" in the plot title.
knots <- age.quan[2:4]
par(mfrow=c(1, 2))
bsage2 <- bs(age, knots=knots, degree=2)
matplot(age, bsage2, type='l', lwd=2, xlab="x", ylab="Basis Function",
        main="Quadratic B-spline")
abline(v=knots, lty=2)
bspline.fit2 <- lm(strontium.ratio~bsage2)
bspline.fit2$coefficients
## (Intercept) bsage21 bsage22 bsage23 bsage24
## 7.073910e-01 2.662466e-05 7.080513e-05 -1.840711e-04 -4.837951e-05
## bsage25
## 1.161913e-04
# A quadratic spline helps with smoothness, though not as much as cubic.
# Fix: "Quardratic" -> "Quadratic" in the plot title.
plot(age, strontium.ratio, pch=20, xlab="age (in millions of years)",
     main="Quadratic B-spline fit", col=age.group)
lines(age, bspline.fit2$fitted, lwd=2)
abline(v=knots, lty=2)
# Cubic (degree-3, the bs() default) B-spline; knots at the interior
# quartiles.
knots <- age.quan[2:4]
par(mfrow = c(1, 2))
bsage3 <- bs(age, knots = knots, degree = 3)
matplot(age, bsage3, type = 'l', lwd = 2,
        xlab = "x", ylab = "Basis function", main = "Cubic B-spline")
abline(v = knots, lty = 2)
bspline.fit3 <- lm(strontium.ratio ~ bsage3)
bspline.fit3$coefficients
## (Intercept) bsage31 bsage32 bsage33 bsage34
## 7.073775e-01 5.292379e-05 9.588816e-05 9.179065e-06 -3.088446e-04
## bsage35 bsage36
## 1.481660e-04 7.909088e-05
# A cubic B-spline fixes the smoothness problem, but uses more parameters.
# In a B-spline with polynomial degree p and K knots, the model has
# K + p + 1 parameters (the +1 is the intercept) -- independent of n.
# K and p trade off; the most common approach is a cubic spline with the
# number and location of knots tuned by CV: more knots = more flexibility,
# and an overly flexible model can have high prediction error.
plot(age,strontium.ratio,pch=20,xlab="age (in millions of years)",
main="Cubic B-spline fit", col=age.group)
lines(age, bspline.fit3$fitted,lwd=2)
abline(v=knots, lty=2)
We can conduct cross-validation to assess different choice of knots and/or how many knots.
Note that, the more knots we use, the more basis functions we need. Hence the model becomes more flexible.
library(boot)
# BUG FIX: this object is named -- and discussed below as -- a 6th-order
# polynomial ("beta0 + beta1*x + ... + beta6*x^6"), but the original
# code fitted the plain linear model strontium.ratio ~ age.  Fit the
# intended degree-6 polynomial instead.
poly.deg6 <- glm(strontium.ratio ~ poly(age, 6), family="gaussian",
                 data=fossil)
bs3knots.fit <- glm(strontium.ratio ~ bs(age, knots = age.quan[2:4], degree=3),
                    family="gaussian")
# A more flexible spline: six hand-picked knots.
bs6knots.fit <- glm(strontium.ratio ~ bs(age, knots = c(94.08, 104.7, 108.6,
                                                        110.6, 115.2, 119.7),
                                         degree=3),
                    family="gaussian")
# K=10 here means 10-fold CV -- unrelated to the number of knots K.
cv.glm(data=fossil, poly.deg6, K=10)$delta[1]
## [1] 5.903382e-09
# we have 7 params: beta0 + beta1*x + beta2*x^2 +...+ beta6*x^6
# 10-fold CV error for the 3-knot cubic spline.
cv.glm(data=fossil, bs3knots.fit, K=10)$delta[1] # also 7 parameters but smaller error; 3 knots + 3 degrees of the polynomial + 1 (intercept) = 7
## Warning in bs(age, degree = 3L, knots = c(`25%` = 104.43358625, `50%` =
## 109.477, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
## Warning in bs(age, degree = 3L, knots = c(`25%` = 104.43358625, `50%` =
## 109.477, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
## Warning in bs(age, degree = 3L, knots = c(`25%` = 104.43358625, `50%` =
## 109.477, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
## Warning in bs(age, degree = 3L, knots = c(`25%` = 104.43358625, `50%` =
## 109.477, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
## [1] 8.458514e-10
# 10-fold CV error for the more flexible 6-knot cubic spline.
cv.glm(data=fossil, bs6knots.fit, K=10)$delta[1]
## Warning in bs(age, degree = 3L, knots = c(94.08, 104.7, 108.6, 110.6, 115.2, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(age, degree = 3L, knots = c(94.08, 104.7, 108.6, 110.6, 115.2, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(age, degree = 3L, knots = c(94.08, 104.7, 108.6, 110.6, 115.2, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(age, degree = 3L, knots = c(94.08, 104.7, 108.6, 110.6, 115.2, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## [1] 9.38517e-10
The B-spline is the foundation of other popular spline methods.
Function ns() fits Natural (Cubic) Splines to the data.
The Natural (Cubic) Spline uses (cubic) B-splines, but with more
constraints for the boundary segments — e.g., a linear spline for the
first and the last segments.
# Natural cubic spline: a cubic B-spline constrained to be linear
# beyond the boundary knots.
par(mfrow=c(1, 2))
ns3knots <- ns(age, knots = age.quan[2:4])
matplot(age, ns3knots, type='l', lwd=2, xlab="x", ylab="Basis function",
        main="Natural spline")
abline(v=knots, lty=2)
# CONSISTENCY FIX: the original fit hard-coded rounded knot values
# c(104.43, 109.48, 115.41); use the exact quartile knots so the fitted
# model matches the basis plotted above.
ns3knots.fit <- glm(strontium.ratio ~ ns(age, knots = age.quan[2:4]),
                    family="gaussian")
plot(age, strontium.ratio, pch=20, xlab="age (in millions of years)",
     main="Natural-spline fit", col=age.group)
lines(age, ns3knots.fit$fitted, lwd=2)
abline(v=knots, lty=2)
Function smooth.spline() fits smoothing splines. You may
consider smoothing splines as B-splines with many knots and a penalty on
the “roughness” (i.e., the “bumps” in the plots — more precisely, on the
second derivative \(g''(x) = d^2 g(x)/dx^2\)).
Use one of the following argument to set the model flexibility.
df =: degrees of freedom between 1 and (number of
unique X-values). Large df = value makes the splines less
smooth but more flexible. (Recall flexible models tend to have smaller
bias, larger variance, and higher risk of overfitting.)
spar =: smooth parameter, typically between 0 and 1.
Large spar = value makes the splines smoother, but less
flexible. (Recall inflexible or restrictive models tend to have larger
bias, smaller variance, and higher risk of underfitting.)
lambda =: penalty parameter. It depends on the scale
of the predictor. Large lambda = value makes the splines
smoother, but less flexible.
# smooth.spline() takes x and y directly (no formula interface), plus
# one flexibility control -- here the effective degrees of freedom.
ss.df5  <- smooth.spline(x = age, y = strontium.ratio, df = 5)
ss.df15 <- smooth.spline(x = age, y = strontium.ratio, df = 15)
ss.df25 <- smooth.spline(x = age, y = strontium.ratio, df = 25)
plot(age, strontium.ratio, pch = 20,
     xlab = "age (in millions of years)", main = "Smoothing-spline fit")
ss.cols <- c("red", "blue", "orange")
lines(ss.df5,  col = ss.cols[1], lwd = 2)
lines(ss.df15, col = ss.cols[2], lwd = 2)
lines(ss.df25, col = ss.cols[3], lwd = 2)
legend('bottomleft', legend = c("Df = 5", "Df = 15", "Df = 25"),
       col = ss.cols, lwd = 2)
cv = TRUE computes LOOCV. For regression, the prediction Sum of
Squared Errors (PRESS) will be calculated.
cv.press <- rep(NA, 25)
# LOOCV PRESS for smoothing splines with df = 2..25 (df must exceed 1,
# so cv.press[1] stays NA; which.min() ignores NA entries).
for (df.val in 2:length(cv.press)) {
  cv.press[df.val] <- smooth.spline(x = age, y = strontium.ratio,
                                    df = df.val, cv = TRUE)$cv.crit
}
plot(cv.press, type = "b", pch = 20)
# The curve flattens quickly -- around df = 10 already looks adequate.
# In general splines suit a single predictor; with several predictors,
# use a generalized additive model instead.
which.min(cv.press)
## [1] 15
# Local regression (loess) at two smoothing spans: smaller span = more
# local, more wiggly fit.
plot(age, strontium.ratio, pch = 20,
     xlab = "age (in millions of years)", main = "Local Regression")
fit  <- loess(strontium.ratio ~ age, span = .2, data = fossil)
fit2 <- loess(strontium.ratio ~ age, span = .5, data = fossil)
# ~200 evenly spaced evaluation points spanning the observed age range
age.grid <- seq(min(age), max(age), by = diff(range(age)) / 200)
lines(age.grid, predict(fit,  data.frame(age = age.grid)), col = "red",  lwd = 2)
lines(age.grid, predict(fit2, data.frame(age = age.grid)), col = "blue", lwd = 2)
legend("bottomleft", legend = c("Span = 0.2", "Span = 0.5"),
       col = c("red", "blue"), lty = 1, lwd = 2)
# One-time install of the CRAN package 'gam'; comment out after the
# first run to avoid reinstalling on every knit.
install.packages("gam")
##
## The downloaded binary packages are in
## /var/folders/3w/r54r07ws7z76wprt6939zc040000gn/T//RtmpQls8VG/downloaded_packages
library(gam)
## Loading required package: foreach
## Loaded gam 1.22-3
# Open the help page for gam().
? gam()
Recall the Auto data set in the ISLR2
package. This data frame has 392 observations on the following 9
variables.
mpg: miles per gallon
cylinders: Number of cylinders between 4 and
8
displacement: Engine displacement (cu.
inches)
horsepower: Engine horsepower
weight: Vehicle weight (lbs.)
acceleration: Time to accelerate from 0 to 60 mph
(sec.)
year: Model year (modulo 100)
origin: Origin of car (1. American, 2. European, 3.
Japanese)
name: Vehicle name
library(ISLR2)
colnames(Auto)
## [1] "mpg" "cylinders" "displacement" "horsepower" "weight"
## [6] "acceleration" "year" "origin" "name"
# Work on a copy; recode the numeric `origin` (1/2/3) as a labeled factor.
auto.data <- Auto
auto.data$country <- factor(auto.data$origin, labels = c("American", "European",
"Japanese"))
library("ggplot2")
library("GGally")
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise scatter-plot matrix of the first five variables, colored by
# country of origin.
ggpairs(auto.data, columns = 1:5, ggplot2::aes(colour=country)) + theme_bw()
# install.packages("car")
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:boot':
##
## logit
# Variance inflation factors: values above ~10 indicate strong
# collinearity among the predictors.
mpg.lm <- lm(mpg ~ cylinders + displacement + horsepower + weight +
acceleration + year, data=auto.data )
vif(mpg.lm)
## cylinders displacement horsepower weight acceleration year
## 10.633049 19.641683 9.398043 10.731681 2.625581 1.244829
# With the factor `country` added, vif() reports generalized VIFs (GVIF)
# and the Df-adjusted GVIF^(1/(2*Df)).
mpg.lm2 <- lm(mpg ~ cylinders + displacement + horsepower + weight +
acceleration + year + country, data=auto.data )
vif(mpg.lm2) # With categorical predictors
## GVIF Df GVIF^(1/(2*Df))
## cylinders 10.737771 1 3.276854
## displacement 22.937950 1 4.789358
## horsepower 9.957265 1 3.155513
## weight 11.074349 1 3.327814
## acceleration 2.625906 1 1.620465
## year 1.301373 1 1.140777
## country 2.096060 2 1.203236
Function step() conducts stepwise for linear model and
generalized linear models using AIC or BIC.
# Print the full model (all predictors plus country) before selection.
mpg.lm2
##
## Call:
## lm(formula = mpg ~ cylinders + displacement + horsepower + weight +
## acceleration + year + country, data = auto.data)
##
## Coefficients:
## (Intercept) cylinders displacement horsepower
## -17.95460 -0.48971 0.02398 -0.01818
## weight acceleration year countryEuropean
## -0.00671 0.07910 0.77703 2.63000
## countryJapanese
## 2.85323
# Stepwise selection in both directions with the default penalty k = 2,
# i.e. AIC.
step(mpg.lm2, direction="both") # AIC
## Start: AIC=946.48
## mpg ~ cylinders + displacement + horsepower + weight + acceleration +
## year + country
##
## Df Sum of Sq RSS AIC
## - acceleration 1 7.09 4194.5 945.14
## - horsepower 1 19.24 4206.6 946.28
## <none> 4187.4 946.48
## - cylinders 1 25.41 4212.8 946.85
## - displacement 1 107.32 4294.7 954.40
## - country 2 355.96 4543.3 974.46
## - weight 1 1147.04 5334.4 1039.39
## - year 1 2461.64 6649.0 1125.74
##
## Step: AIC=945.14
## mpg ~ cylinders + displacement + horsepower + weight + year +
## country
##
## Df Sum of Sq RSS AIC
## <none> 4194.5 945.14
## - cylinders 1 26.85 4221.3 945.64
## + acceleration 1 7.09 4187.4 946.48
## - horsepower 1 58.80 4253.3 948.60
## - displacement 1 102.96 4297.4 952.65
## - country 2 357.11 4551.6 973.17
## - weight 1 1372.52 5567.0 1054.11
## - year 1 2455.71 6650.2 1123.81
##
## Call:
## lm(formula = mpg ~ cylinders + displacement + horsepower + weight +
## year + country, data = auto.data)
##
## Coefficients:
## (Intercept) cylinders displacement horsepower
## -16.33231 -0.50277 0.02337 -0.02500
## weight year countryEuropean countryJapanese
## -0.00646 0.77388 2.63452 2.85736
To use BIC as the selection criterion, set argument
k = log(sample size). Though the output still lists “AIC =”,
it is actually the BIC value.
# Stepwise with penalty k = log(n): this is BIC, even though the output
# header still prints "AIC=".
step(mpg.lm2, direction="both", k = log(length(mpg.lm2$fitted.values)))
## Start: AIC=982.22
## mpg ~ cylinders + displacement + horsepower + weight + acceleration +
## year + country
##
## Df Sum of Sq RSS AIC
## - acceleration 1 7.09 4194.5 976.91
## - horsepower 1 19.24 4206.6 978.05
## - cylinders 1 25.41 4212.8 978.62
## <none> 4187.4 982.22
## - displacement 1 107.32 4294.7 986.17
## - country 2 355.96 4543.3 1002.26
## - weight 1 1147.04 5334.4 1071.16
## - year 1 2461.64 6649.0 1157.51
##
## Step: AIC=976.91
## mpg ~ cylinders + displacement + horsepower + weight + year +
## country
##
## Df Sum of Sq RSS AIC
## - cylinders 1 26.85 4221.3 973.44
## - horsepower 1 58.80 4253.3 976.40
## <none> 4194.5 976.91
## - displacement 1 102.96 4297.4 980.45
## + acceleration 1 7.09 4187.4 982.22
## - country 2 357.11 4551.6 997.00
## - weight 1 1372.52 5567.0 1081.91
## - year 1 2455.71 6650.2 1151.61
##
## Step: AIC=973.44
## mpg ~ displacement + horsepower + weight + year + country
##
## Df Sum of Sq RSS AIC
## - horsepower 1 50.63 4272.0 972.15
## <none> 4221.3 973.44
## - displacement 1 79.90 4301.2 974.82
## + cylinders 1 26.85 4194.5 976.91
## + acceleration 1 8.53 4212.8 978.62
## - country 2 342.93 4564.3 992.12
## - weight 1 1437.29 5658.6 1082.34
## - year 1 2462.30 6683.6 1147.60
##
## Step: AIC=972.15
## mpg ~ displacement + weight + year + country
##
## Df Sum of Sq RSS AIC
## - displacement 1 38.44 4310.4 969.69
## <none> 4272.0 972.15
## + horsepower 1 50.63 4221.3 973.44
## + acceleration 1 44.74 4227.2 973.99
## + cylinders 1 18.68 4253.3 976.40
## - country 2 296.95 4568.9 986.55
## - weight 1 1620.18 5892.1 1092.22
## - year 1 2729.74 7001.7 1159.85
##
## Step: AIC=969.69
## mpg ~ weight + year + country
##
## Df Sum of Sq RSS AIC
## <none> 4310.4 969.69
## + displacement 1 38.4 4272.0 972.15
## + acceleration 1 9.5 4300.9 974.79
## + horsepower 1 9.2 4301.2 974.82
## + cylinders 1 1.1 4309.3 975.56
## - country 2 258.5 4569.0 980.58
## - year 1 2786.7 7097.1 1159.19
## - weight 1 5712.8 10023.2 1294.51
##
## Call:
## lm(formula = mpg ~ weight + year + country, data = auto.data)
##
## Coefficients:
## (Intercept) weight year countryEuropean
## -18.306944 -0.005887 0.769849 1.976306
## countryJapanese
## 2.214534
Remarks:
See the help file for other arguments in step(). For
example, you can specify the scope of the
selection.
Other packages may have their own stepwise selection function.
E.g., stepAIC() in package MASS.
Use regsubsets() from package {leaps}.
# install.packages("leaps")
library(leaps)
# Exhaustive best-subset selection over the 8 predictor columns
# (country contributes two dummy variables).
mpg.sub <- regsubsets(mpg ~ cylinders + displacement + horsepower +
weight + acceleration + year + country, auto.data)
summary(mpg.sub)
## Subset selection object
## Call: regsubsets.formula(mpg ~ cylinders + displacement + horsepower +
## weight + acceleration + year + country, auto.data)
## 8 Variables (and intercept)
## Forced in Forced out
## cylinders FALSE FALSE
## displacement FALSE FALSE
## horsepower FALSE FALSE
## weight FALSE FALSE
## acceleration FALSE FALSE
## year FALSE FALSE
## countryEuropean FALSE FALSE
## countryJapanese FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## cylinders displacement horsepower weight acceleration year
## 1 ( 1 ) " " " " " " "*" " " " "
## 2 ( 1 ) " " " " " " "*" " " "*"
## 3 ( 1 ) " " " " " " "*" " " "*"
## 4 ( 1 ) " " " " " " "*" " " "*"
## 5 ( 1 ) " " "*" " " "*" " " "*"
## 6 ( 1 ) " " "*" "*" "*" " " "*"
## 7 ( 1 ) "*" "*" "*" "*" " " "*"
## 8 ( 1 ) "*" "*" "*" "*" "*" "*"
## countryEuropean countryJapanese
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " "*"
## 4 ( 1 ) "*" "*"
## 5 ( 1 ) "*" "*"
## 6 ( 1 ) "*" "*"
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
For models with the same number of parameters (\(p\)), the “best” model is selected based on the lowest Sum of Squared Residuals (aka SSE, Residual Sum of Squares, RSS).
Use desired selection criteria to determine the optimal number of parameters (𝑝).
mpg.subsum <- summary(mpg.sub)
# Adjusted R^2 for the best model of each size (larger is better).
mpg.subsum$adjr2
## [1] 0.6918423 0.8071941 0.8107755 0.8171643 0.8183256 0.8200125 0.8206916
## [8] 0.8205274
# BIC for each size (smaller/more negative is better).
mpg.subsum$bic
## [1] -450.5016 -629.3564 -631.7442 -640.2482 -637.7889 -636.4914 -633.0215
## [8] -627.7135
# Mallows' Cp (look for Cp close to the parameter count).
mpg.subsum$cp
## [1] 281.637026 31.899439 25.082435 12.251831 10.735481 8.104512 7.648634
## [8] 9.000000
par(mfrow=c(1, 2))
plot(mpg.sub)
# Same selection display ranked by adjusted R^2 instead of BIC.
plot(mpg.sub, scale = "adjr2")
Function glmnet() in package glmnet can fit
both Ridge regression and LASSO (least absolute shrinkage and selection
operator). The package also has a function cv.glmnet() that
conducts K-fold cross-validation.
# install.packages("glmnet")
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
For Ridge regression and LASSO, glmnet() and
cv.glmnet() need, at least, the following arguments:
x: the matrix of predictors terms. (The design matrix of the linear model, but without the column of 1s.)
y: the vector of response.
alpha: 0 for Ridge regression, 1 for LASSO.
lambda: often a decreasing sequence of positive numbers.
glmnet(x, y, alpha=0, lambda) — the alpha argument controls whether we’re using LASSO (alpha=1) or ridge (alpha=0).
# OLS fit shown again for comparison with the penalized fits below.
mpg.lm2
##
## Call:
## lm(formula = mpg ~ cylinders + displacement + horsepower + weight +
## acceleration + year + country, data = auto.data)
##
## Coefficients:
## (Intercept) cylinders displacement horsepower
## -17.95460 -0.48971 0.02398 -0.01818
## weight acceleration year countryEuropean
## -0.00671 0.07910 0.77703 2.63000
## countryJapanese
## 2.85323
# glmnet wants the design matrix without the intercept column.
auto.X <- model.matrix(mpg.lm2)[, -1] # Remove the first 1 column of 1's
dim(auto.X)
## [1] 392 8
# alpha = 0 --> ridge regression, over a decreasing grid of lambdas.
auto.ridge <- glmnet(x = auto.X, y=auto.data$mpg, alpha=0, lambda = seq(10, 0, by= -0.1)) # can set a range of lambda
plot(auto.ridge)
- What’s in the above plot? Is it really the L1 norm?
With alpha=0 in the glmnet() function, the horizontal axis is the
L2 norm \(\sum_{j=1}^{p} \beta_j^2\).
Use cv.glmnet() to choose lambda by cross-validation:
set.seed(2023)
# 10-fold CV (the cv.glmnet default) over a coarse lambda grid.
auto.ridgeCV <- cv.glmnet(x=auto.X, y = auto.data$mpg, alpha=0,
lambda=seq(10, 0, by=-0.1))
auto.ridgeCV
##
## Call: cv.glmnet(x = auto.X, y = auto.data$mpg, lambda = seq(10, 0, by = -0.1), alpha = 0)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0 101 11.35 0.9068 8
## 1se 1 91 12.19 1.1264 8
# Refit on a finer grid: lambda.min moves off the boundary value 0.
auto.ridgeCV <- cv.glmnet(x=auto.X, y = auto.data$mpg, alpha=0,
lambda=seq(10, 0, by=-0.01))
auto.ridgeCV
##
## Call: cv.glmnet(x = auto.X, y = auto.data$mpg, lambda = seq(10, 0, by = -0.01), alpha = 0)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.04 997 11.43 0.9461 8
## 1se 1.19 882 12.37 1.2189 8
# Components of the CV object; lambda.min and lambda.1se are the usual
# choices for the tuning parameter.
names(auto.ridgeCV)
## [1] "lambda" "cvm" "cvsd" "cvup" "cvlo"
## [6] "nzero" "call" "name" "glmnet.fit" "lambda.min"
## [11] "lambda.1se" "index"
plot(auto.ridgeCV)
Use predict(..., type="coefficients") to extract the coefficients at a chosen lambda.
lambda.opt <- auto.ridgeCV$lambda.min
lambda.opt
## [1] 0.04
# Ridge coefficients at the CV-selected lambda; note they are shrunken
# relative to the OLS fit above.
predict(auto.ridge, s = lambda.opt, type="coefficients")
## 9 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -17.306347741
## cylinders -0.433900198
## displacement 0.019984317
## horsepower -0.019202445
## weight -0.006350692
## acceleration 0.058553699
## year 0.766611471
## countryEuropean 2.517553514
## countryJapanese 2.796552339
Use predict(..., type="response") to get predicted responses.
# Generate a "new" set of X-values for the sample.
# Three randomly sampled rows of the design matrix.
# NOTE(review): no set.seed() immediately before this sample() call, so
# the selected rows vary between runs.
auto.new <- auto.X[sample(nrow(auto.X), 3), ]
dim(auto.new)
## [1] 3 8
lambda.opt
## [1] 0.04
# Predicted mpg for the sampled rows at the optimal lambda.
predict(auto.ridge, newx=auto.new, s=lambda.opt, type="response")
## s1
## 231 16.19851
## 258 23.41549
## 336 30.56023
Set alpha=1 in glmnet() and
cv.glmnet() to fit LASSO.
head(auto.X, 2)
## cylinders displacement horsepower weight acceleration year countryEuropean
## 1 8 307 130 3504 12.0 70 0
## 2 8 350 165 3693 11.5 70 0
## countryJapanese
## 1 0
## 2 0
# alpha = 1 --> LASSO, on the same lambda grid as the ridge fit.
auto.lasso <- glmnet(x = auto.X, y=auto.data$mpg, alpha=1, lambda = seq(10, 0, by= -0.1))
# Side-by-side coefficient paths for ridge vs LASSO.
par(mfrow=c(1, 2))
plot(auto.ridge, main="Ridge regression")
plot(auto.lasso, main="LASSO")
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
- Compare the above plots:
Which one is really using L1 Norm?
What are the similarities?
What is the main difference?
Fitting the model, obtaining the regression coefficients and predicting the response.
set.seed(2023)
# CV for LASSO on a fine lambda grid; note the 1se solution keeps only
# 5 nonzero coefficients (LASSO performs variable selection).
auto.lassoCV <- cv.glmnet(x=auto.X, y = auto.data$mpg, alpha=1,
lambda=seq(10, 0, by=-0.001))
auto.lassoCV
##
## Call: cv.glmnet(x = auto.X, y = auto.data$mpg, lambda = seq(10, 0, by = -0.001), alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.006 9995 11.34 0.9191 8
## 1se 0.551 9450 12.26 1.1264 5
plot(auto.lassoCV)
# Reuse lambda.opt for the LASSO's CV-selected value.
lambda.opt <- auto.lassoCV$lambda.min
lambda.opt
## [1] 0.006
# LASSO coefficients at the CV-selected lambda.
predict(auto.lasso, s = lambda.opt, type="coefficients")
## 9 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -17.800644562
## cylinders -0.449440589
## displacement 0.022357124
## horsepower -0.017541460
## weight -0.006647528
## acceleration 0.074342410
## year 0.774185494
## countryEuropean 2.562802913
## countryJapanese 2.794885759
# The same three sampled rows used in the ridge prediction example.
auto.new
## cylinders displacement horsepower weight acceleration year countryEuropean
## 231 8 350 170 4165 11.4 77 0
## 258 6 232 90 3210 17.2 78 0
## 336 4 122 88 2500 15.1 80 1
## countryJapanese
## 231 0
## 258 0
## 336 0
# LASSO predictions for the same rows (compare with the ridge values).
predict(auto.lasso, newx=auto.new, s=lambda.opt, type="response")
## s1
## 231 16.21961
## 258 23.43743
## 336 30.58691
lm.ridge() in package MASS:
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
# lm.ridge() traces the ridge coefficients over a grid of lambda values.
auto.ridge2 <- lm.ridge(mpg ~ cylinders + displacement + horsepower + weight +
acceleration + year + country, data = auto.data,
lambda = seq(0, 100, 0.01))
plot(auto.ridge2)
# Refit on a finer, narrower grid and pick lambda via the built-in
# selection criteria (HKB, L-W, GCV).
auto.ridge2 <- lm.ridge(mpg ~ cylinders + displacement + horsepower + weight +
acceleration + year + country, data = auto.data,
lambda = seq(0, 4, 0.001))
select(auto.ridge2)
## modified HKB estimator is 1.30179
## modified L-W estimator is 1.309865
## smallest value of GCV at 0.929
# GCV profile over lambda; its minimum matches the value reported above.
plot(auto.ridge2$lambda, auto.ridge2$GCV)